setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES_Dryad/08_TE_analysis_M1/")

library(ggplot2)
library(MASS)
library(viridis)
## Loading required package: viridisLite
library(msir)
## Package 'msir' version 1.3.2
## Type 'citation("msir")' for citing this R package in publications.
overall_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row))){
  for (j in seq(length(row))){
    if (i>j){
    sqsum=sqsum+(row[j]-row[i])**2
    ncomps=ncomps+1
    }}} 
return(sqrt(sqsum/ncomps))
}

overall_sd_tendatapoints<-function(row){
sqsum=vector()
ncomps=vector()
for (i in seq(length(row))){
  for (j in seq(length(row))){
    if (i>j){
    sqsum=c(sqsum,(row[j]-row[i])**2)
    }}} 
sqsum_sampled=sample(sqsum,size=10, replace = FALSE, prob = NULL)
sqsum<-sum(sqsum_sampled)
return(sqrt(sqsum/10))
}

intergenerational_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row)-1)){
    if (i != 6){
    sqsum=sqsum+(row[i+1]-row[i])**2
    ncomps=ncomps+1
    }}
return(sqrt(sqsum/ncomps))
}

#density
get_density <- function(x, y, ...) {
  dens <- MASS::kde2d(x, y, ...)
  ix <- findInterval(x, dens$x)
  iy <- findInterval(y, dens$y)
  ii <- cbind(ix, iy)
  return(dens$z[ii])
}


loess_fit_and_plotres<-function(df,x,y,thr_type,thr){

dat<-df
#dat<-dat[which(dat[,"log10_mean"]>0),]
  
l<-loess.sd(x = dat[,x],y = dat[,y], nsigma = 1.96)

l_fit<-data.frame(x=l$x,y=l$y,sd=l$sd,upper=l$upper,lower=l$lower,ID=dat$ID)
l_fit$Z<-(dat[,y]-l_fit$y)/l_fit$sd
l_fit$pvalue<-pnorm(l_fit$Z,lower.tail = FALSE)
l_fit$ID<-dat$ID

if (thr_type=="fdr"){
l_fit_padj<-l_fit[which(l_fit$x>0.9999999),]
l_fit_padj$padj<-p.adjust(l_fit_padj$pvalue,method = "fdr")
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit_padj[which(l_fit_padj$padj<thr),"ID"]),"sig"]<-1
}

else if (thr_type=="pval"){
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit[which(l_fit$pvalue<thr),"ID"]),"sig"]<-1
dat[which(dat$log10_mean<1),"sig"]<-0
}

dat$density<-get_density(dat[,x],dat[,y], n = 100)
print(ggplot(dat)+geom_point(aes(dat[,x],dat[,y],color=density))+ scale_color_viridis()+geom_line(aes(x=l_fit$x,y=l_fit$lower),color="pink",linetype="dashed")+geom_line(aes(x=l_fit$x,y=l_fit$y),color="pink")+geom_line(aes(x=l_fit$x,y=l_fit$upper),color="pink",linetype="dashed")+geom_point(data=subset(dat,sig==1),aes(dat[which(dat$sig==1),x],dat[which(dat$sig==1),y]),color="red")+geom_hline(yintercept = -0.6,linetype="dashed")+ylab(y)+xlab(x))

dat<-merge(dat,l_fit[,c("pvalue","ID","Z")],by="ID")

return(dat[which(dat$sig==1),])

}
setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES_Dryad/08_TE_analysis_M1/")

ttg_DEseqnorm<-read.table("22G_counts/final_counts_table/all_counts_M1_deseqnorm_averaged.txt")
ttg_A_DEseqnorm<-ttg_DEseqnorm[,1:12]

dat_ttgA<-data.frame(mean=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=mean),
                     sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=sd),
                     overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd),
                     inter_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
                     subsampled_overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
                     ID=rownames(ttg_A_DEseqnorm),
                     max=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgA<-dat_ttgA[which(dat_ttgA$mean>0),]

#calculate log10cv2 values
dat_ttgA$log10_overall_cv2<-log10((dat_ttgA$overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_inter_cv2<-log10((dat_ttgA$inter_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_overallsub_cv2<-log10((dat_ttgA$subsampled_overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_mean<-log10(dat_ttgA$mean+1)
dat_ttgA$log10_cv2<-log10((dat_ttgA$sd/dat_ttgA$mean)**2)
dat_ttgA$ff<-(dat_ttgA$overall_sd**2)/dat_ttgA$mean


dat_ttgA<-dat_ttgA[order(dat_ttgA$log10_mean,decreasing = FALSE),]


#plot
dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_overall_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")

dat_ttgA_noInf<-dat_ttgA[-which(dat_ttgA$log10_overallsub_cv2== (-Inf)),]
dat_ttgA_noInf$density<-get_density(dat_ttgA_noInf$log10_mean,dat_ttgA_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgA_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")

dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_inter_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")
## Warning: Removed 1 rows containing missing values (geom_point).

ttgA_fdr0.2<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","fdr",0.2)

ttgA_pvalue0.01<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","pval",0.01)

ttg_B_DEseqnorm<-ttg_DEseqnorm[,13:24]

dat_ttgB<-data.frame(mean=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=mean),
                     sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=sd),
                     overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd),
                     inter_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
                     subsampled_overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
                     ID=rownames(ttg_B_DEseqnorm),
                     max=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgB<-dat_ttgB[which(dat_ttgB$mean>0),]

#calculate log10cv2 values
dat_ttgB$log10_overall_cv2<-log10((dat_ttgB$overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_inter_cv2<-log10((dat_ttgB$inter_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_overallsub_cv2<-log10((dat_ttgB$subsampled_overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_mean<-log10(dat_ttgB$mean+1)
dat_ttgB$log10_cv2<-log10((dat_ttgB$sd/dat_ttgB$mean)**2)
dat_ttgB$ff<-(dat_ttgB$overall_sd**2)/dat_ttgB$mean

dat_ttgB<-dat_ttgB[order(dat_ttgB$log10_mean,decreasing = FALSE),]


#plot
dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_overall_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")

dat_ttgB_noInf<-dat_ttgB[-which(dat_ttgB$log10_overallsub_cv2== (-Inf)),]
dat_ttgB_noInf$density<-get_density(dat_ttgB_noInf$log10_mean,dat_ttgB_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgB_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")

dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_inter_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")

ttgB_fdr0.2<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","fdr",0.2)

ttgB_pvalue0.01<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","pval",0.01)

length(ttgA_fdr0.2$ID); length(ttgB_fdr0.2$ID)
## [1] 177
## [1] 116
length(intersect(ttgA_fdr0.2$ID,ttgB_fdr0.2$ID))
## [1] 39
length(ttgA_pvalue0.01$ID); length(ttgB_pvalue0.01$ID)
## [1] 278
## [1] 251
length(intersect(ttgA_pvalue0.01$ID,ttgB_pvalue0.01$ID))
## [1] 90
write.table(ttgA_fdr0.2$ID,file="HV22Gs_ttgA_fdr0.2.txt",quote = FALSE)
write.table(ttgB_fdr0.2$ID,file="HV22Gs_ttgB_fdr0.2.txt",quote = FALSE)
write.table(ttgA_pvalue0.01$ID,file="HV22Gs_ttgA_pval0.01.txt",quote = FALSE)
write.table(ttgB_pvalue0.01$ID,file="HV22Gs_ttgB_pval0.01.txt",quote = FALSE)
#get all permutations of 12 data points -->  calculate cv2 using consecutive dps --> how does it compare to the actual cv2?

#permutations(n=12,r=12,v=1:12,repeats.allowed = FALSE)
#too many combinations and becomes computationally infeasible

n_perms_cv2<-function(row){
  N<-100000
  cv2s<-rep(0,N)
  for (rep in seq(N)){
    perm<-sample(row,size=12,replace=FALSE)
    cv2s[rep]<-intergenerational_sd(perm)
    }
  obs_cv2<-intergenerational_sd(row)
  p_value<-(length(which(cv2s<=obs_cv2))+1)/N
  return(p_value)
}


n_perms_cv2_morevar<-function(row){
  N<-100000
  cv2s<-rep(0,N)
  for (rep in seq(N)){
    perm<-sample(row,size=12,replace=FALSE)
    cv2s[rep]<-intergenerational_sd(perm)
    }
  obs_cv2<-intergenerational_sd(row)
  p_value<-(length(which(cv2s>=obs_cv2))+1)/N
  return(p_value)
}


#calculate pvals genome-wide (just ran it once with 10^5 reps and now I'm opening the saved file)

atleast_ten_ttgs_A<-dat_ttgA[which(dat_ttgA$mean>=10),"ID"]
atleast_ten_ttgs_B<-dat_ttgA[which(dat_ttgB$mean>=10),"ID"]
write.table(union(atleast_ten_ttgs_A,atleast_ten_ttgs_B),file="at_least_10_22Gs_in_atleast_onelin.txt")

#removed genes with <10 mean 22Gs to gain statistical power

#ttg_A_lessvar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs_A),],MARGIN = 1,FUN=n_perms_cv2)
#ttg_B_lessvar_p<-apply(ttg_B_DEseqnorm[which(rownames(ttg_B_DEseqnorm) %in% atleast_ten_ttgs_B),],MARGIN = 1,FUN=n_perms_cv2)

# 
# morevar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs),],MARGIN = 1,FUN=n_perms_cv2_morevar)
# #morevar_padj<-p.adjust(morevar_p,method = "fdr")
#  
#write.table(ttg_A_lessvar_p,file="ttgA_pvalues_lessvar_100000reps.txt",quote=FALSE)
#write.table(ttg_B_lessvar_p,file="ttgB_pvalues_lessvar_100000reps.txt",quote=FALSE)

#write.table(morevar_p,file="../gene_lists/pvalues_morevar_100000reps.txt",quote=FALSE)

# 
# #open saved files
# 
lessvar_p_A<-read.table("ttgA_pvalues_lessvar_100000reps.txt")
lessvar_df_A<-data.frame(pval=lessvar_p_A$x,padj=p.adjust(lessvar_p_A$x,method="fdr"),ID=rownames(lessvar_p_A))

lessvar_p_B<-read.table("ttgB_pvalues_lessvar_100000reps.txt")
lessvar_df_B<-data.frame(pval=lessvar_p_B$x,padj=p.adjust(lessvar_p_B$x,method="fdr"),ID=rownames(lessvar_p_B))


ggplot(lessvar_df_A)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_pvalhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_padjhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_pvalhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_padjhist.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A$padj<0.1))
## [1] 265
length(which(lessvar_df_B$padj<0.1))
## [1] 387
length(which(lessvar_df_A$padj<0.2))
## [1] 666
length(which(lessvar_df_B$padj<0.2))
## [1] 1021
length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 70
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 582
#188 in lineage A, 354 in lineage B, overlap 58

length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 216
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 1471
#565 in lineage A, 848 in lineage B, overlap 203


write.table(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1.txt")
write.table(lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1.txt")
write.table(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1.txt")

write.table(lessvar_df_A[,"ID"],file="inter_overall_genes_linA_BACKGROUND_LIST.txt")
write.table(lessvar_df_B[,"ID"],file="inter_overall_genes_linB_BACKGROUND_LIST.txt")
removegenes<-read.table("removegenes.txt"); removegenes<-as.character(removegenes$x)

lessvar_df_A_nobatch<-lessvar_df_A[-which(lessvar_df_A$ID %in% removegenes),]
lessvar_df_B_nobatch<-lessvar_df_B[-which(lessvar_df_B$ID %in% removegenes),]

ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_pvalhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linA_padjhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_pvalhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("inter_vs_overall_linB_padjhist_nobatch.pdf",dpi="retina")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A_nobatch$padj<0.1))
## [1] 167
length(which(lessvar_df_B_nobatch$padj<0.1))
## [1] 253
length(which(lessvar_df_A_nobatch$padj<0.2))
## [1] 447
length(which(lessvar_df_B_nobatch$padj<0.2))
## [1] 743
length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 36
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 384
#114 in lineage A, 237 in lineage B, overlap 30, union 321

length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 105
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 1085
#364 in lineage A, 606 in lineage B, overlap 97, union 873


write.table(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1_nobatch.txt")
write.table(lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1_nobatch.txt")
write.table(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1_nobatch.txt")
top20<-union(lessvar_df_A[order(lessvar_df_A$padj),"ID"][1:20],lessvar_df_B[order(lessvar_df_B$padj),"ID"][1:20])

for (gene in top20){
plot(1:24,as.numeric(ttg_DEseqnorm[gene,1:24]),main=gene)
}